Mathematics & Physics Department - M.Sc in Data Science

Evolutionary Computation Methods for Financial Machine Learning



Genetic Algorithms for Hyperparameter and Network Topology Optimization

M.Sc Juan Francisco Muñoz-Elguezabal

May 2021 | Repository: Link


2. Install/Load Packages and Dependencies

In [ ]:
%%capture

# Install all the pip packages in the requirements.txt
# (%%capture silences pip's install log; remove it to debug install failures)
# Using "{sys.executable} -m pip" targets the kernel's own environment.
import sys
!{sys.executable} -m pip install -r requirements.txt

2.2 Python Packages

In order to run this notebook, it is necessary to have installed and/or have the requirements.txt file with the following:

Generic
  • pandas>=1.1.2
  • numpy>=1.20
Data sources
  • ccxt>=1.42
Data Visualization
  • jupyter>=1.0.0
  • plotly>=4.10.0
  • pydot>=1.4.1
  • graphviz>=0.14.1
  • matplotlib>=3.3.4
  • seaborn>=0.11.1
Stats tools
  • statsmodels>=0.12.0
  • scikit-learn>=0.23.2
Evolutionary methods
  • gplearn>=0.4.1
  • sympy>=1.1.1
Machine Learning Models
  • tensorflow==2.4.1
  • keras>=2.4.3

Import project scripts

In [1]:
# -- Import other scripts (project-local modules)
import functions as fn
import visualizations as vs
import data as dt

# -- basic functions
import pandas as pd
import numpy as np
import random

# -- file operations
from os import listdir, path
from os.path import isfile, join

# -- complementary (rich rendering of print/inspect in the notebook)
from rich import print
from rich import inspect

# Reproducible results
random.seed(123)

# visualize plotly in notebook
import plotly.io as pio             # to define input-output of plots
# BUG FIX: the trailing comment was truncated and left a stray bare token
# ("meh") on its own line, which raised NameError on a fresh kernel run.
pio.renderers.default = "notebook"  # to render the plot locally in your default web browser

LOAD PRICES, FOLDS AND PROCESSED RESULTS DATA

In [2]:
# Route to backup files folder
# NOTE(review): relative path — assumes the notebook runs from the repository root.
dir_route = 'files/backups/ludwig/test_1_09042021/'

# Available files with experiment data
# (sorted so the integer index `experiment` below is deterministic across OSes)
abspath = path.abspath(dir_route)
experiment_files = sorted([f for f in listdir(abspath) if isfile(join(abspath, f))])

# Experiments to show
# [11, 0.9, 0.5, 'all']

# Experiment file (index into the sorted file list above)
experiment = 11

# Fold case: keyed on the FIRST character of the selected file name
# (presumably the file-name prefix encodes the fold period — confirm with dt.fold_cases)
fold_case = dt.fold_cases[experiment_files[experiment][0]]

# Final route
file_route = dir_route + experiment_files[experiment]

# Historical prices (OHLC data provided by the project's data module)
historical_prices = dt.ohlc_data

# Timeseries data division in t-folds
folds = fn.t_folds(p_data=historical_prices, p_period=fold_case)

# Load data (previously generated results)
# NOTE(review): pickle load of locally produced files; the sklearn version-mismatch
# warning seen in the output comes from unpickling estimators saved with 0.24.1.
memory_palace = dt.data_pickle(p_data_objects=None, p_data_action='load', p_data_file=file_route)
memory_palace = memory_palace['memory_palace']

# List with the names of the models
ml_models = list(dt.models.keys())
/home/franciscome/.local/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning:

Trying to unpickle estimator LogisticRegression from version 0.24.1 when using version 0.24.0. This might lead to breaking code or invalid results. Use at your own risk.

TEXT DESCRIPTION OF EXPERIMENT

Description of the found result, covering: fold size, cost function, feature transformation, train-validation proportion, and embargo.

PLOT TIME SERIES BLOCK T-FOLDS

In [3]:
# Start and end timestamps of every fold, drawn as vertical lines on the plot
dates_folds = [stamp
               for n_fold in folds
               for stamp in (folds[n_fold]['timestamp'].iloc[0],
                             folds[n_fold]['timestamp'].iloc[-1])]

# OHLC plot of the full history with the t-fold boundaries overlaid
plot_2 = vs.plot_ohlc(p_ohlc=historical_prices, p_theme=dt.theme_plot_2, p_vlines=dates_folds)

# Render the figure
plot_2.show()

EXPERIMENT 1: OOS GENERALIZATION

In [4]:
# Filtered cases: thresholds applied to the stored per-fold metrics
filters = {'filter_1': {'metric': 'acc-train', 'objective': 'above_threshold', 'threshold': 0.90},
           'filter_2': {'metric': 'acc-val', 'objective': 'above_threshold', 'threshold': 0.50},
           'filter_3': {'metric': 'acc-diff', 'objective': 'all'}}

# metric type for MIN, MAX, MODE
metric_case = 'acc-train'

# -- get MIN, MAX, MODE, FILTERED Cases
met_cases = fn.model_cases(p_models=ml_models, p_global_cases=memory_palace,
                           p_data_folds=folds, p_cases_type=metric_case, p_filters=filters)

# NOTE: as before, df_filtered ends up holding the metrics of the LAST model
# (in ml_models order) that had at least one matching period.
# BUG FIX: initialized here so the display below cannot raise NameError when
# no model has any matching period.
df_filtered = pd.DataFrame()

for model_case in ml_models:

    # periods with at least 1 matching case
    filtered_periods = met_cases[model_case]['met_filter']['period']

    # in case of at least 1 period found
    if len(filtered_periods) > 0:
        # collect all occurrences, then concatenate ONCE
        # (BUG FIX: the original called pd.concat inside the loop — O(n^2) work)
        data = [met_cases[model_case]['met_filter']['data'][i_period]['metrics']
                for i_period in filtered_periods]
        df_filtered = pd.concat(data, axis=1)

        # format to output dataframe
        df_filtered.index.name = model_case + '-metrics'

df_filtered.head()
Out[4]:
y_2012_0
ann-mlp-metrics
acc-train 0.902628
acc-val 0.518519
acc-mean 0.710574
acc-diff 0.384109
acc-weighted 0.412904

DATA PROFILES

In [5]:
# Fold to make a description
# Column name looks like 'y_2012_0'; [:-2] strips the trailing occurrence suffix
# ('_0') to recover the fold key ('y_2012') — assumes single-digit suffixes.
des_fold = df_filtered.columns[0][:-2]

# TABLE data profile (Target): train/validation label series for the chosen fold
exp_train_y = memory_palace[des_fold]['features']['train_y']
exp_val_y = memory_palace[des_fold]['features']['val_y']

# NOTE(review): p_mult=10000 presumably scales returns to basis points — confirm
# against fn.data_profile.
tabla_1 = fn.data_profile(p_data=exp_train_y, p_type='target', p_mult=10000)
display(tabla_1)
tabla_2 = fn.data_profile(p_data=exp_val_y, p_type='target', p_mult=10000)
display(tabla_2)
0    350
1    297
Name: cod_t1, dtype: int64
0    94
1    68
Name: cod_t1, dtype: int64
In [6]:
# TABLE data profile (Inputs): train/validation feature matrices for the chosen fold
exp_train_x = memory_palace[des_fold]['features']['train_x']
exp_val_x = memory_palace[des_fold]['features']['val_x']

# NOTE(review): p_type='target' is used here for INPUT data — looks like a
# copy-paste from the previous (target) cell; confirm whether fn.data_profile
# expects a different p_type for feature matrices. Also note tabla_3/tabla_4
# are computed but never displayed.
tabla_3 = fn.data_profile(p_data=exp_train_x, p_type='target', p_mult=10000)
tabla_4 = fn.data_profile(p_data=exp_val_x, p_type='target', p_mult=10000)

PLOT: MULTI PLOT HISTOGRAMS

In [7]:
# Histograms of the first 9 input features of the training set
plot_2_1 = vs.plot_h_histograms(p_data=exp_train_x.iloc[:, :9])

# Render the figure
plot_2_1.show()

PLOT: HEATMAP CORRELATIONS

In [9]:
# -- Target vs linear auto-regressive features: Pearson correlation heatmap
title_txt = 'Linear-Autoregressive Features Vs Target Correlation (pearson)'

# Target column joined with the first 55 (autoregressive) features
exp_1 = pd.concat([exp_train_y.copy(), exp_train_x.iloc[:, 0:55].copy()], axis=1)
exp_1_corr_p = exp_1.corr('pearson')
exp_1_plot = vs.plot_heatmap_corr(p_data=exp_1_corr_p.copy(), p_title=title_txt)

# Render the figure
exp_1_plot.show()
In [10]:
# -- Target and Symbolic Features correlation
# Target column joined with the last 40 (symbolic) features
exp_2 = pd.concat([exp_train_y.copy(), exp_train_x.iloc[:, -40:].copy()], axis=1)
# BUG FIX: the original computed exp_1.corr and plotted exp_1_corr_p here
# (copy-paste), so the "Symbolic Features" heatmap silently showed the
# autoregressive features instead of exp_2.
exp_2_corr_p = exp_2.corr('pearson')
title_txt = 'Symbolic Features Vs Target Correlation (pearson)'
exp_2_plot = vs.plot_heatmap_corr(p_data=exp_2_corr_p.copy(), p_title=title_txt)

# Show plot
exp_2_plot.show()

PLOT: All ROCs in FOLD

In [11]:
# Hall-of-Fame case selection for this plot
case = 'met_max'

# data subset to use
subset = 'train'

# metric to use
metric_case = 'acc-train'

# Model to evaluate
model_case = 'ann-mlp'

# Fold (period) to evaluate
period_case = 'y_2012'

# parameters of the evaluated models
d_params = memory_palace[period_case][model_case]['p_hof']['hof']

# ROC inputs (tpr/fpr) plus the chosen metric, one entry per HoF member
hof_evals = memory_palace[period_case][model_case]['e_hof']
d_plot_4 = {}
for member in range(len(d_params)):
    member_rates = hof_evals[member]['metrics'][subset]
    d_plot_4[member] = {'tpr': member_rates['tpr'],
                        'fpr': member_rates['fpr'],
                        metric_case: hof_evals[member]['pro-metrics'][metric_case]}

# Plot title
dt.theme_plot_4['p_labels']['title'] = 'in Fold max & min ' + metric_case + ' ' + subset + ' data'

# Multi-ROC plot of every HoF member in the fold
plot_4 = vs.plot_multiroc(p_data=d_plot_4, p_metric=metric_case, p_theme=dt.theme_plot_4)

# Render the figure
plot_4.show()

PLOT: CLASSIFICATION FOLD RESULTS

In [12]:
# Pick case
case = 'met_max'

# Pick model to generate the plot
model_case = 'ann-mlp'

# Shortcut into the selected model/case results
case_data = met_cases[model_case][case]

# Title: model name plus its in-fold train accuracy
plot_title = 'inFold ' + case + ' for: ' + model_case + ' ' + str(case_data['data']['pro-metrics']['acc-train'])
dt.theme_plot_3['p_labels']['title'] = plot_title

# Observed and predicted labels for the training subset
train_y = case_data['data']['results']['data']['train']

# OHLC prices of the period the case belongs to
ohlc_prices = folds[case_data['period']]

ohlc_class = {'train_y': train_y['train_y'], 'train_y_pred': train_y['train_pred_y']}

# Vertical line at the end of the training span
date_vlines = [ohlc_class['train_y'].index[-1]]

# Build the classification-results plot
plot_3 = vs.plot_ohlc_class(p_ohlc=ohlc_prices, p_theme=dt.theme_plot_3, p_data_class=ohlc_class,
                            p_vlines=date_vlines)

# Render the figure
plot_3.show()

MIN, MAX, MODE CASES

In [13]:
# Model whose Hall-of-Fame results to explore
model_case = ml_models[1]

# Best-of-HoF case for model_case according to metric_case
max_case = met_cases[model_case]['met_max']
maxcase_period = max_case['period']
maxcase_params = max_case['params']
maxcase_metric = max_case[metric_case]

# Single-column DataFrame: metric name -> value for the best period
df_max = pd.DataFrame([max_case['data']['pro-metrics']]).T
df_max.columns = [maxcase_period]
df_max.index.name = model_case + '-metrics'
df_max.head(5)
Out[13]:
y_2012
ann-mlp-metrics
acc-train 0.902628
acc-val 0.518519
acc-mean 0.710574
acc-diff 0.384109
acc-weighted 0.412904
In [14]:
# Worst-of-HoF case for model_case according to metric_case
min_case = met_cases[model_case]['met_min']
mincase_period = min_case['period']
mincase_params = min_case['params']
mincase_metric = min_case[metric_case]

# Single-column DataFrame: metric name -> value for the worst period
df_min = pd.DataFrame([min_case['data']['pro-metrics']]).T
df_min.columns = [mincase_period]
df_min.index.name = model_case + '-metrics'
df_min.head(5)
Out[14]:
y_2010
ann-mlp-metrics
acc-train 0.613601
acc-val 0.617284
acc-mean 0.615444
acc-diff 0.003683
acc-weighted 0.307170
In [15]:
# Parameter modes: each row shows the params, how many times they repeated,
# and the periods in which they appeared.
mode_data = met_cases[model_case]['met_mode']['data']
mode_repetitions = pd.DataFrame(mode_data).T
mode_repetitions.head(5)
Out[15]:
params repetitions periods
(2, 75, 'relu', [0, 0], [0, 0], 0.005, 0.055, 0.4) (2, 75, relu, [0, 0], [0, 0], 0.005, 0.055, 0.4) 0 [y_2009]
(2, 105, 'relu', [0, 0], [0, 0], 0.01, 0.055, 0.07) (2, 105, relu, [0, 0], [0, 0], 0.01, 0.055, 0.07) 0 [y_2009]
(2, 100, 'relu', [0, 0], [0, 0], 0.005, 0.055, 0.1) (2, 100, relu, [0, 0], [0, 0], 0.005, 0.055, 0.1) 0 [y_2009]
(2, 105, 'relu', [0, 0], [0, 0], 0.005, 0.055, 0.001) (2, 105, relu, [0, 0], [0, 0], 0.005, 0.055, 0... 0 [y_2009]
(2, 100, 'relu', [0, 0], [0, 0], 0.02, 0.05, 0.2) (2, 100, relu, [0, 0], [0, 0], 0.02, 0.05, 0.2) 0 [y_2009]